Dual CRISPR Screen Analysis

Count Combination

Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

Instructions

To run this notebook reproducibly, follow these steps:

  1. Click Kernel > Restart & Clear Output
  2. When prompted, click the red Restart & clear all outputs button
  3. In the Input Parameters section, set each variable to the value for your analysis
  4. Click Cell > Run All

Input Parameters


In [ ]:
# --- Run identity ---
# Values left as "" here are handed to check_or_set in the Automated Set-Up
# section together with a fallback (presumably used when the value is empty --
# TODO confirm against check_or_set).
g_timestamp = ""
g_dataset_name = "20160510_A549"
g_count_alg_name = "19mer_1mm_py"

# --- Input: per-fastq count files ---
g_fastq_counts_dir = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160510_D00611_0278_BHK55CBCXX_A549"
g_fastq_counts_run_prefix = "19mer_1mm_py_20160615223822"

# --- Output: collapsed count files ---
g_collapsed_counts_dir = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/processed/20160510_A549"
g_collapsed_counts_run_prefix = ""

# --- Output: combined count file ---
g_combined_counts_dir = ""
g_combined_counts_run_prefix = ""

# --- Directory appended to sys.path so the ccbbucsd package can be imported ---
g_code_location = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python"

CCBB Library Imports


In [ ]:
import sys

# Make the project's Python source directory importable.  The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate entry
# to sys.path each time.
if g_code_location not in sys.path:
    sys.path.append(g_code_location)

Automated Set-Up


In [ ]:
# %load -s describe_var_list /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/utilities/analysis_run_prefixes.py
def describe_var_list(input_var_name_list):
    """Build a printable summary of the named variables.

    Args:
        input_var_name_list: iterable of strings.  Each string is passed to
            eval() to obtain the value to report, so it must be an expression
            (normally the name of a global variable) that resolves in the
            scope where this function is defined.

    Returns:
        A single string holding one "<name>: <value>" line (newline-terminated)
        per entry, in input order; the empty string for an empty input.
    """
    # NB: eval() is only appropriate because the names come from this notebook
    # itself; never feed it externally supplied strings.
    return "".join(["{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list])

In [ ]:
from ccbbucsd.utilities.analysis_run_prefixes import check_or_set, get_run_prefix, get_timestamp
# Resolve every input parameter that may have been left empty.  Each call hands
# check_or_set the user-supplied value plus a fallback; presumably the fallback is
# used only when the supplied value is empty -- TODO confirm against check_or_set.
# Statement order matters: each fallback is built from values resolved above it.
g_timestamp = check_or_set(g_timestamp, get_timestamp())
# Fallback for the collapsed-counts directory is the fastq counts directory.
g_collapsed_counts_dir = check_or_set(g_collapsed_counts_dir, g_fastq_counts_dir)
# Fallback run prefix is derived from dataset name, count algorithm name and timestamp.
g_collapsed_counts_run_prefix = check_or_set(g_collapsed_counts_run_prefix, 
                                             get_run_prefix(g_dataset_name, g_count_alg_name, g_timestamp))
# The combined-counts settings fall back to the (already resolved) collapsed-counts settings.
g_combined_counts_dir = check_or_set(g_combined_counts_dir, g_collapsed_counts_dir)
g_combined_counts_run_prefix = check_or_set(g_combined_counts_run_prefix, g_collapsed_counts_run_prefix)
# Echo the resolved values so they are recorded in the notebook output.
print(describe_var_list(['g_timestamp','g_collapsed_counts_dir','g_collapsed_counts_run_prefix', 
                         'g_combined_counts_dir', 'g_combined_counts_run_prefix']))

In [ ]:
from ccbbucsd.utilities.files_and_paths import verify_or_make_dir
# Check both output directories before any files are written.  Judging by its name,
# verify_or_make_dir creates a directory that does not yet exist -- TODO confirm.
verify_or_make_dir(g_collapsed_counts_dir)
verify_or_make_dir(g_combined_counts_dir)

Count Combination Functions


In [ ]:
# %load -s get_counts_file_suffix /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/construct_counter.py
def get_counts_file_suffix():
    """Return the filename suffix used to select the per-fastq count files."""
    return "counts.txt"

In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/count_combination.py
# ccbb libraries
from ccbbucsd.utilities.analysis_run_prefixes import strip_run_prefix
from ccbbucsd.utilities.files_and_paths import build_multipart_fp, group_files, get_filepaths_by_prefix_and_suffix

# project-specific libraries
from ccbbucsd.malicrispr.count_files_and_dataframes import get_counts_df

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"


def get_collapsed_counts_file_suffix():
    """Return the filename suffix given to the collapsed count files."""
    return "collapsed.txt"


def get_combined_counts_file_suffix():
    """Return the filename suffix given to the combined counts file."""
    return "counts_combined.txt"


def group_lane_and_set_files(filepaths):
    """Group filepaths that differ only in their "_L###_###" designator.

    Delegates to group_files with a regex matching "_L", three digits, "_" and
    three more digits, and an empty replacement string.

    Args:
        filepaths: the filepaths to group.

    Returns:
        Whatever group_files returns; write_collapsed_count_files iterates it
        with .items() as (group key, filepaths in that group) pairs.
    """
    # NB: this regex assumes read designator has *already* been removed
    # and replaced with _ as done by group_read_pairs
    # Raw string literal: "\d" inside a normal string is an invalid escape
    # sequence that newer Python versions warn about.  The pattern is unchanged.
    return group_files(filepaths, r"_L\d\d\d_\d\d\d", "")


def combine_count_files(counts_fp_for_dataset, run_prefix):
    """Gather the count column of every given counts file into one dataframe.

    Args:
        counts_fp_for_dataset: iterable of counts filepaths, each loaded with
            get_counts_df(filepath, run_prefix).
        run_prefix: forwarded unchanged to get_counts_df.

    Returns:
        The dataframe loaded from the first file, with the count column of each
        later file added under that file's own count header; None when no
        filepaths are given.  Column assignment relies on pandas index
        alignment, so rows are presumably expected to line up across files --
        TODO confirm.
    """
    merged_df = None

    for counts_fp in counts_fp_for_dataset:
        header, file_df = get_counts_df(counts_fp, run_prefix)

        if merged_df is not None:
            merged_df[header] = file_df[header]
            continue

        # The first file supplies the base dataframe (all of its columns).
        merged_df = file_df

    return merged_df


def write_collapsed_count_files(input_dir, output_dir, curr_run_prefix, counts_run_prefix, counts_suffix,
                                counts_collapsed_file_suffix):
    """Sum the counts of each group of count files and write one file per group.

    Count files are found in input_dir by counts_run_prefix and counts_suffix,
    then grouped with group_lane_and_set_files.  For every group the count
    columns of its files are added together into a single column named after
    the group key with counts_run_prefix stripped.

    Args:
        input_dir: directory searched for the input count files.
        output_dir: directory the collapsed files are written to.
        curr_run_prefix: first part of each output filename.
        counts_run_prefix: prefix of the input files; also stripped from the
            group key and forwarded to get_counts_df.
        counts_suffix: suffix identifying the input count files.
        counts_collapsed_file_suffix: last part of each output filename.

    Returns:
        None; each collapsed dataframe is written tab-separated, without its index.
    """
    dataset_fps = get_filepaths_by_prefix_and_suffix(input_dir, counts_run_prefix, counts_suffix)

    for group_key, group_fps in group_lane_and_set_files(dataset_fps).items():
        sample_name = strip_run_prefix(group_key, counts_run_prefix)
        collapsed_fp = build_multipart_fp(output_dir, [curr_run_prefix, sample_name, counts_collapsed_file_suffix])
        collapsed_df = None

        for group_fp in group_fps:
            header, file_df = get_counts_df(group_fp, counts_run_prefix)

            if collapsed_df is not None:
                # Addition relies on pandas index alignment, so rows are presumably
                # expected to line up across files -- TODO confirm.
                collapsed_df[sample_name] = collapsed_df[sample_name] + file_df[header]
                continue

            # First file of the group: it becomes the accumulator, with its
            # count column renamed to the sample name.
            file_df.rename(columns={header: sample_name}, inplace=True)
            collapsed_df = file_df

        collapsed_df.to_csv(collapsed_fp, sep="\t", index=False)


def write_combined_count_file(input_dir, output_dir, curr_run_prefix, counts_run_prefix, counts_suffix, combined_suffix):
    """Combine all matching count files into one tab-separated file.

    Args:
        input_dir: directory searched for the input count files.
        output_dir: directory the combined file is written to.
        curr_run_prefix: first part of the output filename.
        counts_run_prefix: prefix of the input count files.
        counts_suffix: suffix identifying the input count files.
        combined_suffix: last part of the output filename.

    Returns:
        None; the combined dataframe is written tab-separated, without its index.

    Raises:
        ValueError: if no input file matches counts_run_prefix and counts_suffix
            in input_dir (previously this surfaced as an AttributeError on None).
    """
    output_fp = build_multipart_fp(output_dir, [curr_run_prefix, combined_suffix])
    counts_fps_for_run = get_filepaths_by_prefix_and_suffix(input_dir, counts_run_prefix, counts_suffix)

    # The input files were located by counts_run_prefix, so that is the prefix
    # get_counts_df must be given (as write_collapsed_count_files does), not the
    # prefix of the file being written.
    combined_df = combine_count_files(counts_fps_for_run, counts_run_prefix)

    # combine_count_files returns None when it was given no filepaths.
    if combined_df is None:
        raise ValueError("No count files found in '{0}' with prefix '{1}' and suffix '{2}'".format(
            input_dir, counts_run_prefix, counts_suffix))

    combined_df.to_csv(output_fp, sep="\t", index=False)

Input Count Filenames


In [ ]:
from ccbbucsd.utilities.files_and_paths import summarize_filenames_for_prefix_and_suffix
# Print the summary for the fastq counts directory, run prefix and counts suffix, so the
# input files this run is about to use (presumably those matching all three -- TODO confirm)
# are recorded in the notebook output.
print(summarize_filenames_for_prefix_and_suffix(g_fastq_counts_dir, g_fastq_counts_run_prefix, get_counts_file_suffix()))

Count Combination Execution


In [ ]:
# Collapse the per-fastq count files into one file per sample: read from the
# fastq counts dir/prefix, write under the collapsed counts dir/prefix.
write_collapsed_count_files(input_dir=g_fastq_counts_dir,
                            output_dir=g_collapsed_counts_dir,
                            curr_run_prefix=g_collapsed_counts_run_prefix,
                            counts_run_prefix=g_fastq_counts_run_prefix,
                            counts_suffix=get_counts_file_suffix(),
                            counts_collapsed_file_suffix=get_collapsed_counts_file_suffix())

In [ ]:
# Combine the collapsed files into a single file.  The inputs are the collapsed
# files (collapsed dir + collapsed run prefix); the output goes under the combined
# dir + combined run prefix.  The two run prefixes were previously passed in each
# other's position, which only worked while they held the same value.
# NOTE(review): confirm which prefix write_combined_count_file hands to
# get_counts_df when the two prefixes differ.
write_combined_count_file(input_dir=g_collapsed_counts_dir,
                          output_dir=g_combined_counts_dir,
                          curr_run_prefix=g_combined_counts_run_prefix,
                          counts_run_prefix=g_collapsed_counts_run_prefix,
                          counts_suffix=get_collapsed_counts_file_suffix(),
                          combined_suffix=get_combined_counts_file_suffix())